# Descriptives PRESVEN data

library(tidyverse)
library(rempsyc)

# Load the election dataset. Later steps read the columns year, estado,
# municipio, centro, of_p, op_p and otro_p from it.
ven <- readRDS(file = "ven_elec_2006_2024_final.rds")

library(dplyr)
library(ggplot2)

# Per-center spread of vote shares in 2024.
# One output row per (estado, municipio, centro), summarising the rows of
# `ven` that belong to that center.
variation_2024_full <- filter(ven, year == 2024) %>%
  group_by(estado, municipio, centro) %>%
  summarise(
    n_mesas = n(),
    sd_of_p = sd(of_p, na.rm = TRUE),
    sd_op_p = sd(op_p, na.rm = TRUE),
    # Range is max minus min, taken here as the difference of range().
    range_of_p = diff(range(of_p, na.rm = TRUE)),
    range_op_p = diff(range(op_p, na.rm = TRUE)),
    mean_of_p = mean(of_p, na.rm = TRUE),
    mean_op_p = mean(op_p, na.rm = TRUE),
    # Coefficient of variation: SD expressed relative to the mean.
    cv_of_p = sd_of_p / mean_of_p,
    cv_op_p = sd_op_p / mean_op_p
  ) %>%
  ungroup()

# Now, simulate a "one-station-per-center" scenario for 2024.
# slice_sample() draws at random, so the seed is fixed to make the draw (and
# every statistic derived from it) reproducible between runs.
set.seed(2024)

# After the draw the data are ungrouped, so the statistics below are computed
# across all sampled rows and the result is a single-row table.
variation_2024_sample <- ven %>%
  filter(year == 2024) %>%
  group_by(estado, municipio, centro) %>%
  slice_sample(n = 1) %>%  # Select only one polling station per center
  ungroup() %>%
  summarise(
    sd_of_p = sd(of_p, na.rm = TRUE),
    sd_op_p = sd(op_p, na.rm = TRUE),
    range_of_p = max(of_p, na.rm = TRUE) - min(of_p, na.rm = TRUE),
    range_op_p = max(op_p, na.rm = TRUE) - min(op_p, na.rm = TRUE),
    mean_of_p = mean(of_p, na.rm = TRUE),
    mean_op_p = mean(op_p, na.rm = TRUE),
    cv_of_p = sd_of_p / mean_of_p,
    cv_op_p = sd_op_p / mean_op_p
  )

# Stack both summaries into one table, tagging every row with its origin.
variation_comparison <- bind_rows(
  mutate(variation_2024_full, dataset = "Full 2024"),
  mutate(variation_2024_sample, dataset = "One Station per Center (2024)")
)

# Plot coefficient of variation (CV) comparison.
# `variation_comparison` holds one row per center for "Full 2024" but a single
# row for the sampled scenario. Plotting the raw rows would stack every
# per-center bar on the same x position, so each dataset is first reduced to
# one value: its mean CV (missing or undefined CVs are ignored).
cv_by_dataset <- variation_comparison %>%
  summarise(cv_of_p = mean(cv_of_p, na.rm = TRUE), .by = dataset)

ggplot(cv_by_dataset, aes(x = dataset, y = cv_of_p, fill = dataset)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Variation in Chavismo Vote Share Within Polling Centers",
    subtitle = "Comparing Full 2024 vs. One Station per Center",
    x = "",
    y = "Coefficient of Variation (CV)"
  )


# Show the results.
# The combined comparison table is printed here; the previously referenced
# `variation_summary` is never created in this script.
print(variation_comparison)

# List the columns available in the raw data
names(ven)

# Compute intra-polling center variation for 2024.
# One output row per (estado, municipio, centro). Variance and SD are only
# reported for centers with more than one row; otherwise they are set to a
# typed missing value so the columns stay numeric.
variation_2024 <- ven %>%
  filter(year == 2024) %>%
  group_by(estado, municipio, centro) %>%
  summarise(
    n_mesas = n(),  # Number of polling stations per center
    # n() is a single value per group, so a plain if/else is used
    var_of_p = if (n() > 1) var(of_p, na.rm = TRUE) else NA_real_,
    var_op_p = if (n() > 1) var(op_p, na.rm = TRUE) else NA_real_,
    var_otro_p = if (n() > 1) var(otro_p, na.rm = TRUE) else NA_real_,
    sd_of_p = if (n() > 1) sd(of_p, na.rm = TRUE) else NA_real_,
    sd_op_p = if (n() > 1) sd(op_p, na.rm = TRUE) else NA_real_,
    sd_otro_p = if (n() > 1) sd(otro_p, na.rm = TRUE) else NA_real_,
    mean_of_p = mean(of_p, na.rm = TRUE),
    mean_op_p = mean(op_p, na.rm = TRUE),
    mean_otro_p = mean(otro_p, na.rm = TRUE)
  ) %>%
  ungroup()

# Average the per-center statistics over all centers and arrange them as one
# row per political block with one column per metric.
summary_2024 <- variation_2024 %>%
  # Each column holds three values, in the order named by `Metric` below.
  reframe(
    Chavismo = c(
      mean(var_of_p, na.rm = TRUE),
      mean(sd_of_p, na.rm = TRUE),
      mean(mean_of_p, na.rm = TRUE)
    ),
    Opposition = c(
      mean(var_op_p, na.rm = TRUE),
      mean(sd_op_p, na.rm = TRUE),
      mean(mean_op_p, na.rm = TRUE)
    ),
    Other = c(
      mean(var_otro_p, na.rm = TRUE),
      mean(sd_otro_p, na.rm = TRUE),
      mean(mean_otro_p, na.rm = TRUE)
    )
  ) %>%
  mutate(Metric = c("Variance", "Standard Deviation", "Mean Vote Share")) %>%
  pivot_longer(
    cols = -Metric,
    names_to = "Political Block",
    values_to = "Value"
  ) %>%
  # pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
  # the metric columns in alphabetical order, as spread() produced them.
  pivot_wider(names_from = Metric, values_from = Value, names_sort = TRUE)

# gt() and its helpers (tab_header, fmt_number, tab_source_note, gtsave) come
# from the gt package, which is not attached anywhere else in this script.
library(gt)

# Build a formatted table from the per-block summary.
summary_table_2024 <- gt(summary_2024) %>%
  tab_header(title = "Intra-Polling Center Variation (2024 Election)") %>%
  fmt_number(
    columns = c(`Mean Vote Share`, `Standard Deviation`, `Variance`),
    decimals = 2
  ) %>%
  tab_source_note(source_note = "Data from 2024 Venezuelan Presidential Election.")

# Display the table
summary_table_2024

# Export the table as a Word document in the working directory
gtsave(summary_table_2024, filename = "var_summary_table_2024.docx")

library(ggplot2)

# One palette shared by the density fills and the dashed mean lines, so a
# block is always drawn in the same colour.
block_colours <- c("Chavismo" = "red", "Opposition" = "blue", "Other" = "grey")

# Mean within-center SD per block, computed once. Feeding this three-row table
# to geom_vline() draws exactly one line per block instead of one per data row.
mean_sd_2024 <- tibble(
  block = names(block_colours),
  mean_sd = c(
    mean(variation_2024$sd_of_p, na.rm = TRUE),
    mean(variation_2024$sd_op_p, na.rm = TRUE),
    mean(variation_2024$sd_otro_p, na.rm = TRUE)
  )
)

# Density of the per-center SD of vote share for each political block.
ggplot(variation_2024) +
  geom_density(aes(x = sd_of_p, fill = "Chavismo"), alpha = 0.5, color = "red") +
  geom_density(aes(x = sd_op_p, fill = "Opposition"), alpha = 0.5, color = "blue") +
  geom_density(aes(x = sd_otro_p, fill = "Other"), alpha = 0.5, color = "grey") +
  scale_fill_manual(values = block_colours) +
  geom_vline(
    data = mean_sd_2024,
    aes(xintercept = mean_sd, colour = block),
    linetype = "dashed"
  ) +
  # Without this scale the mapped line colours fall back to the default hue
  # palette and no longer match the fills.
  scale_colour_manual(values = block_colours) +
  theme_minimal() +
  labs(
    title = "Density Plot of Standard Deviation in Vote Share",
    subtitle = "2024 Presidential Election",
    x = "SD",
    y = "Density",
    fill = "Political Block",
    caption = "Source: Compiled by the authors from data from 2024 Venezuelan Presidential Election. 
    Dashed lines represent the mean SD for each group."
  ) +
  theme(plot.caption = element_text(hjust = 0, face = "italic")) +
  scale_x_continuous(n.breaks = 10) +
  # The fill legend already identifies the blocks; hide the duplicate one.
  guides(colour = "none")

